import os
import string
import PyPDF2
import nltk
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------- DOWNLOAD STOPWORDS ----------------
# Make sure the NLTK stopword corpus is available before it is loaded below.
nltk.download('stopwords')
# Kept as a frozenset: clean_text tests every word for membership, and a set
# lookup is constant-time where the original list required a linear scan.
STOPWORDS = frozenset(stopwords.words('english'))

# ---------------- TEXT CLEANING FUNCTION ----------------
def clean_text(text):
    """Normalize raw text for vectorization.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and any token found in
    ``STOPWORDS`` is dropped.

    Args:
        text: The raw string to normalize.

    Returns:
        The surviving tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(strip_punctuation)
    kept_tokens = filter(lambda token: token not in STOPWORDS, normalized.split())
    return " ".join(kept_tokens)

# ---------------- PDF TEXT EXTRACTION ----------------
def extract_text_from_pdf(file_path):
    """Return the text extracted from every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines so
        that the last word of one page is not fused with the first word of
        the next. An empty string when no page produced text.
    """
    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once and reuse the result; calling extract_text() for
            # both the check and the append would run the extraction twice.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts)

# ---------------- LOAD JOB DESCRIPTION ----------------
JD_PATH = "job_description.pdf"
jd_text_raw = extract_text_from_pdf(JD_PATH)
jd_text = clean_text(jd_text_raw)

# ---------------- LOAD RESUMES ----------------
RESUME_FOLDER = "resumes"
resume_texts = []
resume_names = []

# os.listdir returns entries in arbitrary order, so sort them to keep the
# row order reproducible between runs. The extension check is
# case-insensitive so that files such as "CV.PDF" are not silently skipped.
for file in sorted(os.listdir(RESUME_FOLDER)):
    if file.lower().endswith(".pdf"):
        path = os.path.join(RESUME_FOLDER, file)
        text = extract_text_from_pdf(path)
        resume_texts.append(clean_text(text))
        resume_names.append(file)

# With no resumes there is nothing to score; stop with a clear message rather
# than failing later inside the similarity computation.
if not resume_names:
    raise SystemExit(f"No PDF resumes found in '{RESUME_FOLDER}'.")

# ---------------- TF-IDF VECTORIZATION ----------------
# Row 0 of the matrix is the job description; the remaining rows are the
# resumes, in the same order as resume_names.
documents = [jd_text] + resume_texts
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# ---------------- COSINE SIMILARITY ----------------
# Compare the job description row against every resume row and scale the
# similarities by 100 so they read as percentages.
similarity_scores = cosine_similarity(
    tfidf_matrix[0:1],
    tfidf_matrix[1:]
)[0] * 100

# ---------------- CLASSIFICATION LOGIC ----------------
def classify_resume(score, *, high_threshold=70, moderate_threshold=40):
    """Map a match score to a category label.

    Args:
        score: The match score to classify.
        high_threshold: Minimum score (inclusive) labelled "Highly Matched".
        moderate_threshold: Minimum score (inclusive) labelled
            "Moderately Matched".

    Returns:
        "Highly Matched", "Moderately Matched" or "Not Matched". A score that
        compares false against both thresholds (for example NaN) is
        "Not Matched".
    """
    if score >= high_threshold:
        return "Highly Matched"
    if score >= moderate_threshold:
        return "Moderately Matched"
    return "Not Matched"

# One category label per resume, in the same order as similarity_scores.
categories = list(map(classify_resume, similarity_scores))

# ---------------- FINAL RESULT ----------------
# Assemble the report table and order it from best to worst match.
result_df = pd.DataFrame(
    {
        "Resume Name": resume_names,
        "Match Percentage (%)": similarity_scores.round(2),
        "Category": categories,
    }
).sort_values(by="Match Percentage (%)", ascending=False)

print("\n📊 RESUME CLASSIFICATION AS PER JOB DESCRIPTION\n")
print(result_df.to_string(index=False))

# ---------------- SAVE RESULT ----------------
# Write the same table to CSV, without the DataFrame index column.
OUTPUT_CSV = "resume_screening_result.csv"
result_df.to_csv(OUTPUT_CSV, index=False)
print(f"\n✅ Result saved as {OUTPUT_CSV}")
